#!/usr/bin/env python
from lxml import etree
import requests


# Listing page whose movie entries link to the per-film detail pages.
url = 'https://www.maoyan.com/films/'

# Browser-like request headers. The header name must be spelled with a hyphen
# ("User-Agent"); with an underscore the server never sees it and requests
# falls back to its default "python-requests/x.y" agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}

# A timeout keeps the script from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()

# response.text relies on the encoding that requests guesses by itself; that
# guess was wrong for this page and produced garbled output. Decode the raw
# bytes in response.content with an explicit encoding instead.
html = response.content.decode(encoding="utf-8", errors="ignore")
print(html)

# Parse the explicitly decoded text (not response.text) so that the XPath
# query runs against the same correctly decoded markup that was printed above.
dom = etree.HTML(html)
detail_urls = dom.xpath("//dl[@class='movie-list']/dd/div/a/@href")
print(len(detail_urls))
for detail_url in detail_urls:
    # Each href is printed as found; prepend the site domain to obtain the
    # full detail-page URL.
    print(detail_url)